Libraries

if (!require("renv")) install.packages("renv")
Loading required package: renv

Attaching package: ‘renv’

The following objects are masked from ‘package:stats’:

    embed, update

The following objects are masked from ‘package:utils’:

    history, upgrade

The following objects are masked from ‘package:base’:

    autoload, load, remove
library(renv)
renv::restore()
- The library is already synchronized with the lockfile.
library(here)
here() starts at C:/Users/Marcony1/OneDrive - Fundacion Universidad de las Americas Puebla/Documents/MDS/Block 6/DSCI 532/DSCI_532_individual-assignment_marcony1
library(dplyr)
Warning: package ‘dplyr’ was built under R version 4.3.3
Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(readr)
Warning: package ‘readr’ was built under R version 4.3.3

Read data

zip_file <- here("data", "raw", "iter_00_cpv2020_csv.zip")
temp_dir <- here("temp")
dir.create(temp_dir, showWarnings = FALSE)

unzip(zip_file, files = c("iter_00_cpv2020/conjunto_de_datos/conjunto_de_datos_iter_00CSV20.csv", "iter_00_cpv2020/diccionario_datos/diccionario_datos_iter_00CSV20.csv"), exdir = temp_dir)

data_path <- here(temp_dir,
                 "iter_00_cpv2020",
                 "conjunto_de_datos",
                 "conjunto_de_datos_iter_00CSV20.csv")

dict_path <- here(temp_dir,
                 "iter_00_cpv2020",
                 "diccionario_datos",
                 "diccionario_datos_iter_00CSV20.csv")

info_dict <- read_csv(dict_path)
New names:Rows: 290 Columns: 10── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): ...1, ...2, ...3, ...4, ...5, ...6
lgl (4): ...7, ...8, ...9, ...10
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_csv(data_path)
Rows: 195662 Columns: 286── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (283): ENTIDAD, NOM_ENT, MUN, NOM_MUN, LOC, NOM_LOC, LONGITUD, LATITUD, ALTITUD, POBFEM, POBMAS, P_0A2, P_0A2_F, P_0A2_M, P_3YMAS, P_3YMAS_F, P_3Y...
dbl   (3): POBTOT, VIVTOT, TVIVHAB
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unlink(temp_dir, recursive = TRUE)
# Exporting dictionary file
write_csv(info_dict,
          here("data", "raw", "diccionario_datos_iter_00CSV20.csv"))

Exploration

head(df)
head(info_dict)
clean_info_dict <- info_dict[-c(1:3), ]
names(clean_info_dict) <- clean_info_dict[1, ]
clean_info_dict <- clean_info_dict[-1,]
clean_info_dict <- clean_info_dict[, -c(7:10)]


clean_info_dict
unique_states <- df |> 
      distinct(NOM_ENT)

unique_states

Selecting rows that we’ll analyze

rows_to_include <- c(1:12, 53:132, 136:140, 147, 155:211, 220:232)

filtered_data <- clean_info_dict |> 
      filter(row_number() %in% rows_to_include) |> 
      pull(4)

filtered_data
  [1] "ENTIDAD"    "NOM_ENT"    "MUN"        "NOM_MUN"    "LOC"        "NOM_LOC"    "LONGITUD"   "LATITUD"    "ALTITUD"    "POBTOT"     "POBFEM"    
 [12] "POBMAS"     "REL_H_M"    "POB0_14"    "POB15_64"   "POB65_MAS"  "P_0A4"      "P_0A4_F"    "P_0A4_M"    "P_5A9"      "P_5A9_F"    "P_5A9_M"   
 [23] "P_10A14"    "P_10A14_F"  "P_10A14_M"  "P_15A19"    "P_15A19_F"  "P_15A19_M"  "P_20A24"    "P_20A24_F"  "P_20A24_M"  "P_25A29"    "P_25A29_F" 
 [34] "P_25A29_M"  "P_30A34"    "P_30A34_F"  "P_30A34_M"  "P_35A39"    "P_35A39_F"  "P_35A39_M"  "P_40A44"    "P_40A44_F"  "P_40A44_M"  "P_45A49"   
 [45] "P_45A49_F"  "P_45A49_M"  "P_50A54"    "P_50A54_F"  "P_50A54_M"  "P_55A59"    "P_55A59_F"  "P_55A59_M"  "P_60A64"    "P_60A64_F"  "P_60A64_M" 
 [56] "P_65A69"    "P_65A69_F"  "P_65A69_M"  "P_70A74"    "P_70A74_F"  "P_70A74_M"  "P_75A79"    "P_75A79_F"  "P_75A79_M"  "P_80A84"    "P_80A84_F" 
 [67] "P_80A84_M"  "P_85YMAS"   "P_85YMAS_F" "P_85YMAS_M" "PROM_HNV"   "PNACENT"    "PNACENT_F"  "PNACENT_M"  "PNACOE"     "PNACOE_F"   "PNACOE_M"  
 [78] "PRES2015"   "PRES2015_F" "PRES2015_M" "PRESOE15"   "PRESOE15_F" "PRESOE15_M" "P3YM_HLI"   "P3YM_HLI_F" "P3YM_HLI_M" "P3HLINHE"   "P3HLINHE_F"
 [89] "P3HLINHE_M" "P3HLI_HE"   "P3HLI_HE_F" "P3HLI_HE_M" "PHOG_IND"   "POB_AFRO"   "POB_AFRO_F" "POB_AFRO_M" "PCON_DISC"  "PCON_LIMI"  "PSIND_LIM" 
[100] "P3A5_NOA"   "P3A5_NOA_F" "P3A5_NOA_M" "P6A11_NOA"  "P6A11_NOAF" "P6A11_NOAM" "P12A14NOA"  "P12A14NOAF" "P12A14NOAM" "P15A17A"    "P15A17A_F" 
[111] "P15A17A_M"  "P18A24A"    "P18A24A_F"  "P18A24A_M"  "P8A14AN"    "P8A14AN_F"  "P8A14AN_M"  "P15YM_AN"   "P15YM_AN_F" "P15YM_AN_M" "P15YM_SE"  
[122] "P15YM_SE_F" "P15YM_SE_M" "P15PRI_IN"  "P15PRI_INF" "P15PRI_INM" "P15PRI_CO"  "P15PRI_COF" "P15PRI_COM" "P15SEC_IN"  "P15SEC_INF" "P15SEC_INM"
[133] "P15SEC_CO"  "P15SEC_COF" "P15SEC_COM" "P18YM_PB"   "P18YM_PB_F" "P18YM_PB_M" "GRAPROES"   "GRAPROES_F" "GRAPROES_M" "PEA"        "PEA_F"     
[144] "PEA_M"      "PE_INAC"    "PE_INAC_F"  "PE_INAC_M"  "POCUPADA"   "POCUPADA_F" "POCUPADA_M" "PDESOCUP"   "PDESOCUP_F" "PDESOCUP_M" "PSINDER"   
[155] "PDER_SS"    "P12YM_SOLT" "P12YM_CASA" "P12YM_SEPA" "PCATOLICA"  "PRO_CRIEVA" "POTRAS_REL" "PSIN_RELIG" "TOTHOG"     "HOGJEF_F"   "HOGJEF_M"  
[166] "POBHOG"     "PHOGJEF_F"  "PHOGJEF_M" 
selected_df <- df |> 
      select(filtered_data)

selected_df

Exporting as parquet

# Export wrangled data as parquet file
table <- arrow::Table$create(selected_df)

output_dir <- here("data", "processed", "parquet_data")

arrow::write_dataset(table, output_dir, partitioning = c("NOM_ENT", "ENTIDAD"), existing_data_behavior = "overwrite")

Reading parquet

ds <- open_dataset(here("data", "processed", "parquet_data")) |> 
        collect()

ds
ds_puebla <- open_dataset(here("data", "processed", "parquet_data")) |>
    filter(NOM_ENT=="Puebla") |> 
    collect()

ds_puebla
ds_yucatan <- open_dataset(here("data", "processed", "parquet_data")) |>
    filter(NOM_ENT=="Yucatán") |> 
    collect()

ds_yucatan
ds_nuevo_leon <- open_dataset(here("data", "processed", "parquet_data")) |>
    filter(NOM_ENT=="Nuevo León") |> 
    collect()

ds_nuevo_leon
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCmF1dGhvcjogTWFyY28gUG9sbyBCcmF2byBNb250aWVsDQpkYXRlOiAyMDIwLTA0LTIxDQotLS0NCg0KIyMjIExpYnJhcmllcw0KDQpgYGB7cn0NCmlmICghcmVxdWlyZSgicmVudiIpKSBpbnN0YWxsLnBhY2thZ2VzKCJyZW52IikNCmxpYnJhcnkocmVudikNCnJlbnY6OnJlc3RvcmUoKQ0KbGlicmFyeShoZXJlKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkocmVhZHIpDQpsaWJyYXJ5KGFycm93KQ0KYGBgDQoNCiMjIyBSZWFkIGRhdGENCg0KYGBge3J9DQp6aXBfZmlsZSA8LSBoZXJlKCJkYXRhIiwgInJhdyIsICJpdGVyXzAwX2NwdjIwMjBfY3N2LnppcCIpDQpgYGANCg0KYGBge3J9DQp0ZW1wX2RpciA8LSBoZXJlKCJ0ZW1wIikNCmRpci5jcmVhdGUodGVtcF9kaXIsIHNob3dXYXJuaW5ncyA9IEZBTFNFKQ0KDQp1bnppcCh6aXBfZmlsZSwgZmlsZXMgPSBjKCJpdGVyXzAwX2NwdjIwMjAvY29uanVudG9fZGVfZGF0b3MvY29uanVudG9fZGVfZGF0b3NfaXRlcl8wMENTVjIwLmNzdiIsICJpdGVyXzAwX2NwdjIwMjAvZGljY2lvbmFyaW9fZGF0b3MvZGljY2lvbmFyaW9fZGF0b3NfaXRlcl8wMENTVjIwLmNzdiIpLCBleGRpciA9IHRlbXBfZGlyKQ0KYGBgDQoNCmBgYHtyfQ0KDQpkYXRhX3BhdGggPC0gaGVyZSh0ZW1wX2RpciwNCiAgICAgICAgICAgICAgICAgIml0ZXJfMDBfY3B2MjAyMCIsDQogICAgICAgICAgICAgICAgICJjb25qdW50b19kZV9kYXRvcyIsDQogICAgICAgICAgICAgICAgICJjb25qdW50b19kZV9kYXRvc19pdGVyXzAwQ1NWMjAuY3N2IikNCg0KZGljdF9wYXRoIDwtIGhlcmUodGVtcF9kaXIsDQogICAgICAgICAgICAgICAgICJpdGVyXzAwX2NwdjIwMjAiLA0KICAgICAgICAgICAgICAgICAiZGljY2lvbmFyaW9fZGF0b3MiLA0KICAgICAgICAgICAgICAgICAiZGljY2lvbmFyaW9fZGF0b3NfaXRlcl8wMENTVjIwLmNzdiIpDQoNCmluZm9fZGljdCA8LSByZWFkX2NzdihkaWN0X3BhdGgpDQpkZiA8LSByZWFkX2NzdihkYXRhX3BhdGgpDQoNCg0KdW5saW5rKHRlbXBfZGlyLCByZWN1cnNpdmUgPSBUUlVFKQ0KYGBgDQoNCmBgYHtyfQ0KIyBFeHBvcnRpbmcgZGljdGlvbmFyeSBmaWxlDQp3cml0ZV9jc3YoaW5mb19kaWN0LA0KICAgICAgICAgIGhlcmUoImRhdGEiLCAicmF3IiwgImRpY2Npb25hcmlvX2RhdG9zX2l0ZXJfMDBDU1YyMC5jc3YiKSkNCg0KYGBgDQoNCiMjIyBFeHBsb3JhdGlvbg0KDQpgYGB7cn0NCmhlYWQoZGYpDQpoZWFkKGluZm9fZGljdCkNCmBgYA0KDQpgYGB7cn0NCmNsZWFuX2luZm9fZGljdCA8LSBpbmZvX2RpY3RbLWMoMTozKSwgXQ0KbmFtZXMoY2xlYW5faW5mb19kaWN0KSA8LSBjbGVhbl9pbmZvX2RpY3RbMSwgXQ0KY2xlYW5faW5mb19kaWN0IDwtIGNsZWFuX2luZm9fZGljdFstMSxdDQpjbGVhbl9pbmZvX2RpY3QgPC0gY2xlYW5faW5mb19kaWN0WywgLWMoNzoxMCldDQoNCg0KY2xlYW5faW5mb19kaWN0DQpgYGANCg0KYGBge3J9DQp1bmlxdWVfc3RhdGVzIDwtIGRmIHw+IA0KICAgICAgZGlzdGluY3QoTk9NX0VOVCkNCg0KdW5pcXVlX3N0YXRlcw0KYGBgDQoNCiMjIyBTZWxlY3Rpbmcgcm93cyB0aGF0IHdlJ2xsIGFuYWx5emUNCg0KYGBge3J9DQpyb3dzX3RvX2luY2x1ZGUgPC0gYygxOjEyLCA1MzoxMzIsIDEzNjoxNDAsIDE0NywgMTU1OjIxMSwgMjIwOjIzMikNCg0KZmlsdGVyZWRfZGF0YSA8LSBjbGVhbl9pbmZvX2RpY3QgfD4gDQogICAgICBmaWx0ZXIocm93X251bWJlcigpICVpbiUgcm93c190b19pbmNsdWRlKSB8PiANCiAgICAgIHB1bGwoNCkNCg0KZmlsdGVyZWRfZGF0YQ0KYGBgDQoNCmBgYHtyfQ0Kc2VsZWN0ZWRfZGYgPC0gZGYgfD4gDQogICAgICBzZWxlY3QoZmlsdGVyZWRfZGF0YSkNCg0Kc2VsZWN0ZWRfZGYNCmBgYA0KDQojIyMgRXhwb3J0aW5nIGFzIHBhcnF1ZXQNCg0KYGBge3J9DQojIEV4cG9ydCB3cmFuZ2xlZCBkYXRhIGFzIHBhcnF1ZXQgZmlsZQ0KdGFibGUgPC0gYXJyb3c6OlRhYmxlJGNyZWF0ZShzZWxlY3RlZF9kZikNCg0Kb3V0cHV0X2RpciA8LSBoZXJlKCJkYXRhIiwgInByb2Nlc3NlZCIsICJwYXJxdWV0X2RhdGEiKQ0KDQphcnJvdzo6d3JpdGVfZGF0YXNldCh0YWJsZSwgb3V0cHV0X2RpciwgcGFydGl0aW9uaW5nID0gYygiTk9NX0VOVCIsICJFTlRJREFEIiksIGV4aXN0aW5nX2RhdGFfYmVoYXZpb3IgPSAib3ZlcndyaXRlIikNCmBgYA0KDQojIyMgUmVhZGluZyBwYXJxdWV0DQoNCmBgYHtyfQ0KZHMgPC0gb3Blbl9kYXRhc2V0KGhlcmUoImRhdGEiLCAicHJvY2Vzc2VkIiwgInBhcnF1ZXRfZGF0YSIpKSB8PiANCiAgICAgICAgY29sbGVjdCgpDQoNCmRzDQpgYGANCg0KYGBge3J9DQpkc19wdWVibGEgPC0gb3Blbl9kYXRhc2V0KGhlcmUoImRhdGEiLCAicHJvY2Vzc2VkIiwgInBhcnF1ZXRfZGF0YSIpKSB8Pg0KICAgIGZpbHRlcihOT01fRU5UPT0iUHVlYmxhIikgfD4gDQogICAgY29sbGVjdCgpDQoNCmRzX3B1ZWJsYQ0KYGBgDQoNCmBgYHtyfQ0KZHNfeXVjYXRhbiA8LSBvcGVuX2RhdGFzZXQoaGVyZSgiZGF0YSIsICJwcm9jZXNzZWQiLCAicGFycXVldF9kYXRhIikpIHw+DQogICAgZmlsdGVyKE5PTV9FTlQ9PSJZdWNhdMOhbiIpIHw+IA0KICAgIGNvbGxlY3QoKQ0KDQpkc195dWNhdGFuDQpgYGANCg0KYGBge3J9DQpkc19udWV2b19sZW9uIDwtIG9wZW5fZGF0YXNldChoZXJlKCJkYXRhIiwgInByb2Nlc3NlZCIsICJwYXJxdWV0X2RhdGEiKSkgfD4NCiAgICBmaWx0ZXIoTk9NX0VOVD09Ik51ZXZvIExlw7NuIikgfD4gDQogICAgY29sbGVjdCgpDQoNCmRzX251ZXZvX2xlb24NCmBgYA0K